In [1]:
# Import libraries
import os
import pandas as pd
import fitz 
from PIL import Image, ImageDraw, ImageEnhance
import layoutparser as lp
import numpy as np
import re

import cv2
import matplotlib.pyplot as plt
%matplotlib inline
In [3]:
image_path = 'Pictures_1972/1972alegrete.pdf_page_1.png'
image = cv2.imread(image_path)
# cv2.imread signals a missing/unreadable file by returning None rather than
# raising, so fail here with a clear message instead of further down.
if image is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")
# OpenCV decodes to BGR channel order while matplotlib expects RGB; convert
# for display only and keep `image` itself untouched for the later cells.
plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB));
Out[3]:
<matplotlib.image.AxesImage at 0x12dff0810>
In [4]:
# Fix contrast to make the image clearer.
# The adjusted image is rebuilt from the file on disk rather than from the
# current value of `image`, so re-running this cell does not compound the gain.
CONTRAST_ALPHA = 1.3
image = cv2.convertScaleAbs(cv2.imread(image_path), alpha=CONTRAST_ALPHA)
In [28]:
# View the image with updated contrast.
# Rendered inline with matplotlib instead of cv2.imshow/waitKey: the OpenCV
# window blocks the kernel until a key is pressed and needs a GUI session,
# which prevents an unattended "Restart & Run All".
fig, ax = plt.subplots(figsize=(8, 10))
# OpenCV images are BGR; matplotlib expects RGB.
ax.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
ax.set_title("Contrast-adjusted page")
ax.axis("off");
Out[28]:
-1

Google Cloud Vision OCR¶

In [5]:
# Build the Google Cloud Vision OCR agent from the local credentials file,
# requesting Portuguese ('por') as the recognition language.
gcv_credential_path = "gcv_credentials.json"
ocr_languages = ['por']
ocr_agent2 = lp.GCVAgent.with_credential(gcv_credential_path, languages=ocr_languages)
In [6]:
# Run OCR on the page image; return_response=True keeps the raw response
# object, which the next cell parses into layout elements.
res = ocr_agent2.detect(
    image,
    return_response=True,
)
In [7]:
 # collect all the layout elements at the word level
text = ocr_agent2.gather_full_text_annotation(
    res,
    agg_level=lp.GCVFeatureType.WORD,
)
# Overlay every recognised word and its bounding box on the page
draw_options = dict(font_size=50, with_box_on_text=True, text_box_width=25)
lp.draw_text(image, text, **draw_options)
Out[7]:
In [8]:
# Keep only the words that fall inside the region of interest (pixel coordinates)
filtered_text = text.filter_by(lp.Rectangle(x_1=200, y_1=800, x_2=2450, y_2=1880))

# Visualise the selection.
# NOTE: the box thickness is passed as `text_box_width`, the same keyword the
# full-page visualisation uses; a bare `width=` does not appear to be a
# draw_text parameter and had no visible effect.
lp.draw_text(image, filtered_text, font_size=40, with_box_on_text=True, text_box_width=1)
Out[8]:
In [9]:
# Select the words in the "position" column by its pixel bounding box
position = text.filter_by(
    lp.Rectangle(x_1=200, y_1=700, x_2=400, y_2=1880)
)
# Draw the text boxes. The thickness is passed as `text_box_width` (the
# keyword used by the full-page visualisation); `width=` does not appear to
# be a draw_text parameter and had no visible effect.
lp.draw_text(image, position, font_size=50, with_box_on_text=True, text_box_width=25)
Out[9]:
In [10]:
# Select the words in the "names" column by its pixel bounding box
names = text.filter_by(
    lp.Rectangle(x_1=350, y_1=800, x_2=1200, y_2=1880)
)
# Box thickness goes through `text_box_width` (as in the full-page
# visualisation); `width=` does not appear to be a draw_text parameter.
lp.draw_text(image, names, font_size=50, with_box_on_text=True, text_box_width=25)
Out[10]:
In [11]:
# Select the words in the "party" column by its pixel bounding box
party = text.filter_by(
    lp.Rectangle(x_1=1300, y_1=800, x_2=1750, y_2=1850)
)
# Box thickness goes through `text_box_width` (as in the full-page
# visualisation); `width=` does not appear to be a draw_text parameter.
lp.draw_text(image, party, font_size=50, with_box_on_text=True, text_box_width=25)
Out[11]:
In [12]:
# Select the words in the numeric (votes) column by its pixel bounding box
nums = text.filter_by(
    lp.Rectangle(x_1=1850, y_1=800, x_2=2450, y_2=1850)
)

# Box thickness goes through `text_box_width` (as in the full-page
# visualisation); `width=` does not appear to be a draw_text parameter.
lp.draw_text(image, nums, font_size=50, with_box_on_text=True, text_box_width=25)
Out[12]:

Cleaning and Converting to a DataFrame¶

In [13]:
# create function to group text within each column
def group_blocks_by_distance(blocks, distance_th):
    """Cluster text blocks into groups separated by a vertical gap.

    Blocks are visited from top to bottom (by the y coordinate of their upper
    edge). A new group starts whenever the gap between the upper edge of a
    block and the bottom edge of the block visited just before it exceeds
    ``distance_th``.

    Args:
        blocks: Iterable of layout elements exposing ``coordinates`` as
            ``(x_1, y_1, x_2, y_2)``.
        distance_th: Vertical gap threshold; a gap strictly greater than this
            value opens a new group.

    Returns:
        A list of ``lp.Layout`` objects ordered from top to bottom. Inside
        each group the blocks keep their original input order, so the reading
        order delivered by the OCR engine is not disturbed. An empty input
        yields an empty list.
    """
    blocks = list(blocks)
    if not blocks:
        # Nothing to group; callers index the first block of every group, so
        # returning a single empty layout would only defer the failure.
        return []

    # Visit order: top to bottom by upper edge. The sort is stable, so blocks
    # with equal y keep their input order.
    order = sorted(range(len(blocks)), key=lambda i: blocks[i].coordinates[1])

    # Gap between consecutive blocks in visit order:
    # y coord of the upper edge of the lower block -
    # y coord of the bottom edge of the block above it
    gaps = np.array([
        blocks[cur].coordinates[1] - blocks[prev].coordinates[3]
        for prev, cur in zip(order, order[1:])
    ])

    # The first block always belongs to group 0; every gap above the
    # threshold afterwards bumps the running group id.
    group_ids = np.concatenate(([0], (gaps > distance_th).cumsum()))

    members = [[] for _ in range(int(group_ids[-1]) + 1)]
    for gid, idx in zip(group_ids, order):
        members[gid].append(idx)

    grouped_blocks = []
    for idxs in members:
        group = lp.Layout([])
        # Emit members in input order, not in y-sorted order
        for idx in sorted(idxs):
            group.append(blocks[idx])
        grouped_blocks.append(group)
    return grouped_blocks
In [14]:
# group text within first two columns
A = group_blocks_by_distance(position, 0)
B = group_blocks_by_distance(names, 0)
# Discard empty groups so that taking the first block of a group below
# cannot raise IndexError.
A = [grp for grp in A if len(grp) > 0]
B = [grp for grp in B if len(grp) > 0]

# group rows between columns by aligning y coordinates:
# two groups belong to the same row when the upper edges of their first
# blocks are less than height_th pixels apart; otherwise the higher one is
# emitted alone with an empty string for the missing column.
height_th = 50
idxA, idxB = 0, 0
result1 = []
while idxA < len(A) and idxB < len(B):
    ay = A[idxA][0].coordinates[1]
    by = B[idxB][0].coordinates[1]
    ares, bres = ' '.join(A[idxA].get_texts()), ' '.join(B[idxB].get_texts())
    if abs(ay - by) < height_th:
        idxA += 1; idxB += 1
    elif ay < by:
        idxA += 1; bres = ''
    else:
        idxB += 1; ares = ''
    result1.append([ares, bres])

# The loop stops as soon as one column runs out; emit whatever is left in the
# other column instead of silently dropping those rows (at most one of these
# two slices is non-empty).
result1.extend([' '.join(grp.get_texts()), ''] for grp in A[idxA:])
result1.extend(['', ' '.join(grp.get_texts())] for grp in B[idxB:])

result1
Out[14]:
[['', 'NOME'],
 ['Р', 'Adão Ortiz Houayek'],
 ['-', 'Airtom Pacheco do Amaral'],
 ['·', 'Brasilêncio Lopes Machado ·'],
 ['', 'José Rubens Pillar'],
 ['', 'Ary Faria Marimon'],
 ['- VP', 'Claudio Amildon Rosso .']]
In [15]:
# group text within second 2 columns
# NOTE(review): [1:] discards the first (top-most) group of each column —
# presumably a header line above the table; confirm against the page layout.
C = group_blocks_by_distance(party, 0)[1:]
D = group_blocks_by_distance(nums, 0)[1:]
# Discard empty groups so that taking the first block of a group below
# cannot raise IndexError.
C = [grp for grp in C if len(grp) > 0]
D = [grp for grp in D if len(grp) > 0]

# group rows between columns by aligning y coordinates:
# two groups belong to the same row when the upper edges of their first
# blocks are less than height_th pixels apart; otherwise the higher one is
# emitted alone with an empty string for the missing column.
height_th = 50
idxC, idxD = 0, 0
result2 = []
while idxC < len(C) and idxD < len(D):
    cy = C[idxC][0].coordinates[1]
    dy = D[idxD][0].coordinates[1]
    # Words are concatenated without a separator in these two columns
    cres, dres = ''.join(C[idxC].get_texts()), ''.join(D[idxD].get_texts())
    if abs(cy - dy) < height_th:
        idxC += 1; idxD += 1
    elif cy < dy:
        idxC += 1; dres = ''
    else:
        idxD += 1; cres = ''
    result2.append([cres, dres])

# The loop stops as soon as one column runs out; emit whatever is left in the
# other column instead of silently dropping those rows (at most one of these
# two slices is non-empty).
result2.extend([''.join(grp.get_texts()), ''] for grp in C[idxC:])
result2.extend(['', ''.join(grp.get_texts())] for grp in D[idxD:])

result2
Out[15]:
[['SUBLEGENDA', 'OBTIDA'],
 ['MDB', '11911'],
 ['ARENA-2', '7360'],
 ['ARENA-1', '3193']]
In [16]:
# Stretch the party/votes rows to line up with the position/name rows:
# the header row is kept once, every following row is repeated twice.
result2_adj = [result2[0]]
for party_row in result2[1:]:
    result2_adj.extend([party_row, party_row])
result2_adj
Out[16]:
[['SUBLEGENDA', 'OBTIDA'],
 ['MDB', '11911'],
 ['MDB', '11911'],
 ['ARENA-2', '7360'],
 ['ARENA-2', '7360'],
 ['ARENA-1', '3193'],
 ['ARENA-1', '3193']]
In [17]:
# Join the two halves row by row: [position, name] + [party, votes].
# zip stops at the shorter of the two lists.
combined = []
for left_cols, right_cols in zip(result1, result2_adj):
    combined.append(left_cols + right_cols)
combined
Out[17]:
[['', 'NOME', 'SUBLEGENDA', 'OBTIDA'],
 ['Р', 'Adão Ortiz Houayek', 'MDB', '11911'],
 ['-', 'Airtom Pacheco do Amaral', 'MDB', '11911'],
 ['·', 'Brasilêncio Lopes Machado ·', 'ARENA-2', '7360'],
 ['', 'José Rubens Pillar', 'ARENA-2', '7360'],
 ['', 'Ary Faria Marimon', 'ARENA-1', '3193'],
 ['- VP', 'Claudio Amildon Rosso .', 'ARENA-1', '3193']]
In [18]:
# Convert to dataframe.
# The first row of `combined` is the table header and is skipped. When no
# data rows were extracted, a single blank placeholder row with flag = 1 is
# produced so the page can be identified later; otherwise flag = 0.
data_rows = combined[1:]
if data_rows:
    df = pd.DataFrame(data_rows, columns=["position", "name", "party", "votes"])
    df['flag'] = 0
else:
    df = pd.DataFrame(columns=["position", "name", "party", "votes", "flag"])
    df.loc[0] = ['', '', '', '', 1]

# Municipality and year are encoded in the file name as
# "<year><municipality>.pdf...". Parse the file name only (last path
# component, accepting both / and \ separators) so that digits in parent
# directory names cannot be mistaken for the year.
file_name = re.split(r'[\\/]', image_path)[-1]

# Add municipality column
municipality = re.match(r'\d+(.+)\.pdf', file_name)
df['municipality'] = municipality.group(1) if municipality else ''

# Add year column (kept as a string, like the other extracted fields)
year = re.match(r'\d+', file_name)
df['year'] = year.group(0) if year else ''
df
Out[18]:
position name party votes flag municipality year
0 Р Adão Ortiz Houayek MDB 11911 0 alegrete 1972
1 - Airtom Pacheco do Amaral MDB 11911 0 alegrete 1972
2 · Brasilêncio Lopes Machado · ARENA-2 7360 0 alegrete 1972
3 José Rubens Pillar ARENA-2 7360 0 alegrete 1972
4 Ary Faria Marimon ARENA-1 3193 0 alegrete 1972
5 - VP Claudio Amildon Rosso . ARENA-1 3193 0 alegrete 1972